Meeting with John Dec 6th

Variable Permutations Methods

Within the training dataset each variable was permuted 10 times. These permuted datasets were then used to make a logistic model bagged 100 times. The bagged model was used to predict on the entire (unbalanced) training dataset. The AUC for each permuted dataset of a given variable was then averaged. This mean AUC was subtracted from the base model AUC (delta AUC). The delta AUC was rescaled by dividing by max(delta AUC) to yield variable importance. This whole process took roughly 4 hours using 20 cores of the high memory machine.

# Cached output of permOneVar() (defined below) for the full model on the
# temporally split training data: list(relative importance, mean/sd AUC table,
# raw AUC matrix).
PermFullModel <- readRDS("../data_out/TempSplit/PermFullModel.rds")

Variable Importance

The population size is the most important variable.

#model
# Full model formula: presence/absence of YF cases as a function of the
# environmental and demographic covariates (see the Variable Key below).
glm.formula <- as.formula("case~  
                                NDVI+NDVIScale+
                                popLog10+
                                RFsqrt+RFScale+
                                tempMean+tempScale+
                                fireDenSqrt+fireDenScale+
                                spRich+primProp") 

# Parse the covariate names out of the formula's right-hand side.
variables <- trimws(unlist(strsplit(as.character(glm.formula)[3], "+", fixed = T)), which = "both")
# Labels for every permutation scenario: no permutation ("full model"), each
# single covariate, and all covariates at once ("all permutated").
variablesName <- c("full model", variables, "all permutated")

#Plot relative Importance  
# PermFullModel[[1]] holds the rescaled delta AUC (relative importance) for
# each single-variable permutation, in formula order.
RelImp <- PermFullModel[[1]]
names(RelImp) <- variables
barplot(RelImp)

#rank relative importance (higher = more important)
sort(RelImp)
##  fireDenSqrt         NDVI fireDenScale      RFScale    tempScale 
##   0.03680823   0.08309142   0.13261168   0.17530674   0.18917706 
##     primProp       RFsqrt     tempMean    NDVIScale       spRich 
##   0.19504424   0.23542957   0.23828224   0.30154515   0.54232322 
##     popLog10 
##   1.00000000

Absolute AUC

All variables were permuted simultaneously to get at the baseline AUC of the model approach. It is roughly 0.5 but has wide variability.

# PermFullModel[[2]] is the Model/meanAUC/sdAUC table. It was built with
# cbind() on a character column (see permOneVar below), which coerced the
# numeric columns to character/factor, so they must be converted back.
AUC <- PermFullModel[[2]]
AUC$meanAUC <- as.numeric(as.character(AUC$meanAUC))# undo the cbind() coercion to character/factor
AUC$sdAUC <- as.numeric(as.character(AUC$sdAUC))

# Rank scenarios by mean AUC: a lower AUC means permuting that variable
# removed more information from the model.
AUC[order(AUC$meanAUC, decreasing = TRUE),]
##             Model   meanAUC       sdAUC
## 1      full model 0.8471939 0.002230535
## 9     fireDenSqrt 0.8454479 0.002532170
## 2            NDVI 0.8432525 0.004200308
## 10   fireDenScale 0.8409035 0.003300597
## 6         RFScale 0.8388782 0.005135133
## 8       tempScale 0.8382203 0.003520869
## 12       primProp 0.8379419 0.004117191
## 5          RFsqrt 0.8360262 0.005619717
## 7        tempMean 0.8358909 0.004074073
## 3       NDVIScale 0.8328900 0.004739831
## 11         spRich 0.8214686 0.014090251
## 4        popLog10 0.7997585 0.016359660
## 13 all permutated 0.5181521 0.115364210
library(gplots)
# Mean AUC (+/- 1 sd across the 10 permutations) for each permutation scenario.
    plotCI(x=c(1:length(AUC$Model)),y=AUC$meanAUC, uiw=AUC$sdAUC, gap=0, xaxt="n", xlab="Permuted Variable", ylab="Mean AUC (10 permutations)")
    axis(1, at=c(1:length(AUC$Model)), labels=AUC$Model)

Variable Key

  • case :reporting of any YF cases (0,1)

  • NDVI : NDVI for that month
  • NDVIScale : NDVI rescaled to max value for that muni and calendar month
  • popLog10 : population density
  • RFsqrt : mean hourly rainfall sqrt transformed
  • RFScale : mean hourly rainfall rescaled to max value for that muni and calendar month
  • tempMean : mean monthly air temperature
  • tempScale : mean monthly air temperature rescaled to max value for that muni and calendar month
  • fireDenSqrt : number of fires observed in month divided by muniArea sqrt transformed
  • fireDenScale : number of fires observed in month divided by muniArea rescaled to max value for that muni and calendar month. NA values converted to zero.
  • spRich : number of non-human primates by species with ranges based on IUCN {0-22}
  • primProp : sum of each municipality's relative area that is both agricultural and falls within a primate genus range. {0,9} Missing for 2014

Code to run permutations

# Bagging ----
bagging<-function(form.x.y,training,new.data, n.pres = 10, n.abs = 100){
  # Fit one "bag": a logistic regression on a small balanced subsample of the
  # training data, then predict onto new.data.
  # (modified JP's bagging function 12/1/17 RK)
  #
  # form.x.y : formula for the logistic model; the response must be `case` (0/1)
  # training : data frame containing both presence (case == 1) and absence
  #            (case == 0) rows
  # new.data : data frame for the fitted model to predict onto
  # n.pres   : presence rows to subsample per bag (default 10, as before)
  # n.abs    : absence rows to subsample per bag (default 100, as before)
  #
  # returns list(predictions, coefficients): predicted probabilities for each
  # row of new.data, and the fitted glm coefficients
  #
  # Uses only base/stats, so it no longer attaches dplyr on every call (it is
  # invoked repeatedly inside parallel workers).

  #1. Create a subset of the data with a fixed number of presences and absences
  #   (which() drops rows where case is NA, matching the old dplyr::filter)
  training.pres <- training[which(training$case == 1), , drop = FALSE]
  training.abs  <- training[which(training$case == 0), , drop = FALSE]
  if (nrow(training.pres) < n.pres || nrow(training.abs) < n.abs) {
    stop("not enough presence/absence rows to draw the requested subsample",
         call. = FALSE)
  }
  bag.data <- rbind(
    training.pres[sample(nrow(training.pres), size = n.pres), , drop = FALSE],
    training.abs[sample(nrow(training.abs), size = n.abs), , drop = FALSE]
  )
  #2. Build the logistic regression on the balanced subsample
  glm_fit <- glm(form.x.y, data = bag.data, family = binomial(logit))
  #3 + 4. Return predictions on new.data and the model coefficients
  list(
    predict(glm_fit, newdata = new.data, type = "response"),
    coef(glm_fit)
  )
}

# Permute Variable based on loop iteration of PermOneVar ----
permutedata <- function(formula = glm.formula, trainingdata, i){
  # Return a copy of trainingdata with zero, one, or all of the model's
  # covariates permuted (shuffled within their own column), selected by i:
  #   i == 1                      : nothing permuted ("full model" baseline)
  #   i == length(covariates) + 2 : every covariate permuted ("all permutated")
  #   otherwise                   : the single covariate variablesName[i] permuted
  #
  # formula      : model formula; covariate names are parsed from its RHS
  #                (defaults to the global glm.formula, as before)
  # trainingdata : data frame containing the covariate columns
  # i            : index into c("full model", covariates, "all permutated"),
  #                i.e. the outer-loop iteration of permOneVar()
  #
  # returns the (possibly permuted) data frame

  #parse covariate names out of the formula's right-hand side
  variables <- trimws(unlist(strsplit(as.character(formula)[3], "+", fixed = TRUE)), which = "both")
  variablesName <- c("full model", variables, "all permutated")

  permuted.data <- trainingdata
  if (i == 1) {
    # full model: return the data untouched
  } else if (i == length(variablesName)) {
    # permute every covariate, each with its own independent shuffle
    for (vari in variables) {
      permuted.data[, vari] <- sample(permuted.data[, vari], nrow(permuted.data), replace = FALSE)
    }
  } else {
    # permute only the covariate selected by i
    permuted.data[, variablesName[i]] <- sample(permuted.data[, variablesName[i]], nrow(permuted.data), replace = FALSE)
  }

  return(permuted.data)
}
  

permOneVar=function(formula = glm.formula, bag.fnc=bagging,permute.fnc=permutedata, traindata = training,  cores=2, no.iterations= 100, perm=10){
  # Permutation-importance driver.
  #
  # For every permutation scenario -- the unpermuted data ("full model"),
  # each single covariate, and all covariates at once ("all permutated") --
  # permute the training data `perm` times, fit a bagged logistic model
  # (`no.iterations` bags via bag.fnc) on each permuted dataset in parallel,
  # predict back onto the unpermuted training data, and record the AUC.
  #
  # formula       : model formula; covariate names are parsed from its RHS
  # bag.fnc       : single-bag fitting function (see bagging() above)
  # permute.fnc   : data-permutation function (see permutedata() above)
  # traindata     : training data with presence and absence rows
  # cores         : number of parallel workers; default 2
  # no.iterations : number of low-bias (bagged) models per permutation; default 100
  # perm          : number of permutations per scenario; default 10
  #
  # returns list(rel.import, mean.auc, perm.auc):
  #   rel.import : numeric vector of rescaled delta AUC per single covariate
  #   mean.auc   : data frame Model/meanAUC/sdAUC (character columns -- see
  #                the NOTE at the cbind() near the end)
  #   perm.auc   : perm x scenario matrix of raw AUC values
  library(dplyr)
  library(doParallel)
  
  #parse out covariate names from the formula's right-hand side
  variables <- trimws(unlist(strsplit(as.character(formula)[3], "+", fixed = T)), which = "both")
  variablesName <- c("full model", variables, "all permutated")
  
  #make objects for outputs to be saved in
  perm.auc <- matrix(NA, nrow=perm, ncol=length(variablesName)) #place to save AUC of models based on different permuation
  
  for (j in 1:length(variablesName)){
    print(c(j,variablesName[j])) #let us know where the simulation is at. 
    VarToPerm <- j
    for (k in 1:perm){
    #permute the data for scenario j; the old inline if-statements now live in permute.fnc ----
        permuted.data <- permute.fnc(formula=formula, trainingdata= traindata, i=VarToPerm)  #permute variable of interest
        print(c(j,k))
      
    #Set up parallel cores to return 2 lists: 1) predictions, 2) coef of model ----
      cores.to.use <- cores #change dep on computer
      iterations <- no.iterations #number of low bias models to make
      
      #create class to combine multiple results (predictions + coefs) per worker
      multiResultClass <- function(predictions = NULL,coefs = NULL)
      {
        me <- list(
          predictions = predictions,
          coefs = coefs
        )
        
        ## Set the name for the class
        class(me) <- append(class(me),"multiResultClass")
        return(me)
      }
      
      
      # a fresh cluster is created and torn down for every single permutation
      cl <- makeCluster(cores.to.use)
      registerDoParallel(cl)
      
      trainModel <- foreach(i=1:iterations) %dopar% {
        result <- multiResultClass()
        output1 <- bag.fnc(form.x.y=formula,training= permuted.data ,new.data=traindata)
        result$predictions <-output1[[1]]
        #result$coefs <- output1[[2]] # coefficient tracking disabled (memory)
        return(result)
      }
      
      stopCluster(cl)
      #aggregate data from clusters ----
      #one column of predictions per bagged model
      trainingPreds <- do.call(cbind,(lapply(trainModel, '[[', 1)))
      #trainingCoefs <- do.call(cbind,(lapply(trainModel, '[[', 2)))
      
      # bagged prediction = row-wise mean across all bagged models
      output.preds<- apply(trainingPreds, 1, mean)
      # NOTE(review): prediction() and performance() below come from the ROCR
      # package, which is never loaded in this function or anywhere visible in
      # this file -- it is presumably attached by the caller; confirm.
      preds <- prediction(output.preds, traindata$case) #other projects have used dismo::evaluate instead. Not sure if is makes a difference. 
      
      #matrix of AUC to return
      perm.auc[k,j] <- unlist(performance(preds, "auc")@y.values)
      
    }
  }
  #calculate relative importance
  perm.auc.mean <- apply(perm.auc,2,mean)
  perm.auc.sd <- apply(perm.auc, 2, sd)
  delta.auc <- perm.auc.mean[1] - perm.auc.mean[-c(1, length(perm.auc.mean))] #change in AUC from base model only for single variable permutation
  rel.import <- delta.auc/max(delta.auc) # normalized relative change in AUC from base model only for single variable permutation
  
  #Output for relative importance
  # NOTE(review): relative.import is built here but never used or returned;
  # the numeric vector rel.import is returned instead.
  relative.import <- as.data.frame(cbind(Permutated=variables,RelImport=rel.import))
    #plot it for fun
  barplot(rel.import, names.arg = variables)
  #Output for mean and sd of permutations for all permutations (non, single var, and all var)
  # NOTE: cbind() with the character Model column coerces meanAUC/sdAUC to
  # character; downstream code undoes this with as.numeric(as.character(.))
  mean.auc <- as.data.frame(cbind(Model=variablesName,meanAUC=perm.auc.mean, sdAUC=perm.auc.sd))
  
  #Output of AUC for each permutation
  colnames(perm.auc) <- variablesName
  
  #return training coefs and AUC for each iteration
  #return(list(train.auc, Coefs))
  return(list(rel.import, mean.auc,perm.auc))
}

Further exploring permutations

Permutation Methods

The methods for the workflow have not changed with the exception of increasing the number of bags from 100 to 1000. This analysis explores different variable combinations. Permutation 3, the final model, was permuted a total of 100 times (instead of the standard 10).

Permutation 1: Popless

Drop One model exploration showed that population was the main explanatory variable. However, the models with population permuted (100 bags, 10 perm) did not have a large drop in AUC. For this reason, we should explore model permutations after dropping population.

#drop one covar importance
# Population-free model: the full-model covariates minus popLog10.
glm.formula.popless <- as.formula("case~  NDVI+NDVIScale+ 
                          RFsqrt+RFScale+
                          tempMean+tempScale+
                          fireDenSqrt+fireDenScale+
                          spRich+primProp") 

# Covariate names parsed from the formula's right-hand side.
PoplessVariables <- trimws(unlist(strsplit(as.character(glm.formula.popless)[3], "+", fixed = T)), which = "both")
#increased number of bagged models to 1000 from 100
# Run commented out (results are loaded from disk): 1000 bags, 10 permutations.
#PermPoplessModel <- permOneVar(formula = glm.formula.popless,traindata = training.data, cores=20, no.iterations = 1000, perm = 10 ) 

The stdev around the AUC for the fully permuted model is reassuring. The model performance is only slightly impacted by removing population. The order of relative importance for the environmental covariates hasn’t changed.

##             Model meanAUC sdAUC
## 12 all permutated   0.514 0.086
## 10         spRich   0.775 0.014
## 3       NDVIScale   0.776 0.012
## 4          RFsqrt   0.795 0.007
## 6        tempMean   0.798 0.004
## 11       primProp   0.800 0.006
## 7       tempScale   0.802 0.003
## 2            NDVI   0.803 0.003
## 9    fireDenScale   0.805 0.009
## 5         RFScale   0.806 0.002
## 8     fireDenSqrt   0.807 0.004
## 1      full model   0.811 0.001

Permutation 2: Single flavor

Is the second environmental covariate really adding anything to the model? The more important of the two flavors of each covariate (based on Permutation 1) were used to build a low variable model. All fire covariates were removed as well.

#permute one covar importance, single flavor (SF) of each covariate
# Reduced model: one flavor per covariate (the more important flavor from
# Permutation 1); all fire covariates dropped.
glm.formula.SF <- as.formula("case~  NDVIScale+ 
                          RFsqrt+
                          tempMean+
                          spRich") 

# Covariate names parsed from the formula's right-hand side.
SingleVariables <- trimws(unlist(strsplit(as.character(glm.formula.SF)[3], "+", fixed = T)), which = "both")

# Run commented out (results are loaded from disk): 1000 bags, 10 permutations.
#PermSFModel <- permOneVar(formula = glm.formula.SF,traindata = training.data, cores=20, no.iterations = 1000, perm = 10 ) #increased number of bagged models to 1000 from 100

Again, the model performance isn’t strongly impacted by removing variables. However, the all permutated model has a slightly elevated mean AUC and larger error.

##            Model meanAUC sdAUC
## 6 all permutated   0.547 0.132
## 5         spRich   0.744 0.029
## 3         RFsqrt   0.767 0.015
## 4       tempMean   0.777 0.011
## 2      NDVIScale   0.781 0.009
## 1     full model   0.801 0.000

Permutation 3: Single flavor with population

Before running this model, we felt like this should be the final model. Because of that, and given the variation of the all permutated model of Permutation 2, the number of permutations was increased from 10 to 100.

#permute one covar importance, single flavor (SF) of each covariate plus population
# Candidate final model: the single-flavor covariates with popLog10 added back.
glm.formula.SFP <- as.formula("case~  popLog10+ 
                          NDVIScale+ 
                          RFsqrt+
                          tempMean+
                          spRich") 

# Covariate names parsed from the formula's right-hand side.
SFPVariables <- trimws(unlist(strsplit(as.character(glm.formula.SFP)[3], "+", fixed = T)), which = "both")

# Run commented out (results are loaded from disk): 1000 bags, 100 permutations.
#PermSFPModel <- permOneVar(formula = glm.formula.SFP,traindata = training.data, cores=20, no.iterations = 1000, perm = 100 ) #increased number of bagged models to 1000 from 100

This code took about 3.5 days to run. The model AUC looks good. The variability around the all permutated model is still high. I wonder if this is expected in models with fewer variables?

##            Model meanAUC sdAUC
## 7 all permutated   0.549 0.132
## 6         spRich   0.794 0.028
## 2       popLog10   0.813 0.018
## 4         RFsqrt   0.816 0.011
## 3      NDVIScale   0.830 0.007
## 5       tempMean   0.835 0.004
## 1     full model   0.848 0.000

Exploring Predictions and Results

Map of Predictions over time

Map of predictions and known cases by month (created via mappingPredictions.Rmd). This is only on the training data (hence the blank municipalities), and includes known true cases as green diamonds.

Ranking of Predictions

We want to explore how the predictions rank, noting where known positives fall out. This plot extends down to the lowest-ranked known positive.

# Bagged-model predictions on the full training data for the final (SFP) model.
predictions <- as.data.frame(readRDS("../data_out/Predictions/SFPPredPlot.rds"))

# Lowest prediction assigned to any known positive; used by the (currently
# disabled) filter below as an optional cutoff.
minTrue <- min(predictions$prediction[predictions$case == 1])
predRank <- predictions %>%
  #filter(prediction>minTrue) %>% #trim off predictions below the last true positive
  arrange(desc(prediction))

# Rank order index: 1 = highest prediction.
predRank$index <- seq.int(nrow(predRank))

# Background points in gray; known positives overplotted in maroon.
ggplot(data = predRank[predRank$case == 0,], aes(y = prediction, x = index)) +
  geom_point(color = "gray80") +
  geom_point(data = predRank[predRank$case == 1,], aes(y = prediction, x = index), color = "maroon") +
  ylab("Prediction")+
  xlab("Rank order")+
  theme_bw() +
  ggtitle("Rank Order of ALL Training Data")

# Split predictions by known case status for the summary stats and histograms.
truePos <- predictions[predictions$case==1,]

bgPreds <- predictions[predictions$case==0,]

The lowest prediction for a known case is 0.02. 622720 instances fall above this, out of a total of 622720. The mean value given to true positives is 0.317 0.267 sd, compared to 0.073 0.107 sd for the background points.

Histograms of the predictions are as follows:

# Distribution of model output for rows with known cases vs background rows.
hist(truePos$prediction, main = "Predictions on True Positives")

hist(bgPreds$prediction, main = "Predictions on Background")

Feedback from John Dec 15th

  • Is the point process clumped?
    • Include a map with all the infected municipalities. The more occurrences, the hotter or larger the point.
  • Are there different processes influencing spillover/reporting in rural vs urban areas?
    • Create 2 models using data split into developed/undeveloped. (Undeveloped includes State of Amazonas, Roraima, etc.)
    • Compare rank order of variable importance between the two models.
  • Model discrimination (categorizing true positive) vs calibration (raw value of the model output)
    • We should focus on discrimination (id. the most risky area, then the next, etc.) not calibration (this area is X%. this area is Y%, etc.).
    • The model is not returning raw probabilities but something proportional to the intensity of events. The probability would be derived by multiplying the model output by the ratio of cases to background points.
    • The rank order of the model output has an interesting shape unlike other models (didn’t ask follow up on that). Would be interesting to see all the points plotted (done).

Comparing variable importance of one model vs two models

Methods

John suggested we explore the possibility of two different point processes for yellow fever spillover in Brazil. We split the country into two regions — the northwest with high primate diversity (\(\geq 6\) of 22) and the remaining portion with low primate diversity. A threshold of 6 species produced the most contiguous region of high primate diversity. Previous iterations of this work used a temporally balanced split for training and testing data. This split was redone to create a balanced split in 3 dimensions (x space, y space, and time). The newly space- and time-balanced data for the whole country (used for the one model approach) was then split into high and low non-human primate diversity regions (used for the two model approach).

Models consisted of 500 bags and 10 permutations unless otherwise stated. This bag/permutation scheme was based on computational time and memory.

## Reading layer `BRAZpolygons' from data source `/Users/renikaul/Documents/YF_Brazil/data_clean' using driver `ESRI Shapefile'
## Simple feature collection with 5561 features and 2 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -73.99442 ymin: -33.75206 xmax: -28.83588 ymax: 5.271807
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs

Rank Order of Variable Importance

Models were built using all of the covariates (full.formula) and just environmental covariates (popless.formula).

# Full formula: every covariate, including both transformed and rescaled
# flavors (used for the one-model vs two-model comparison).
full.formula <- as.formula("case~  NDVI+NDVIScale+
                          popLog10+
                          RF+RFsqrt+RFScale+
                          tempMean+tempScale+
                          fireNum+fireDen+fireDenSqrt+fireDenScale+
                          spRich+primProp") 


# Environmental covariates only (population and raw fire/rain counts dropped).
 popless.formula <- as.formula("case~  NDVI+NDVIScale+ 
                          RFsqrt+RFScale+
                          tempMean+tempScale+
                          fireDenSqrt+fireDenScale+
                          spRich+primProp") 

Effect of Spatially Balanced Split on Single Point Process Model

https://datascience.blog.wzb.eu/2016/09/27/parallel-coordinate-plots-for-discrete-and-categorical-data-in-r-a-comparison/

#load all the data 
# Space-and-time-balanced split results (500 bags).
STPermFullModel    <- readRDS("../data_out/SpaceAndTimeSplit/PermFullModel500.rds")
STPermPoplessModel <- readRDS("../data_out/SpaceAndTimeSplit/PermPoplessModel500.rds")
STPermSFModel      <- readRDS("../data_out/SpaceAndTimeSplit/PermSFModel500.rds")

#Temporal split results
PermFullModel    <- readRDS("../data_out/TempSplit/PermFullModel.rds")
PermPoplessModel <- readRDS("../data_out/TempSplit/PermPoplessModel.rds")

# Rank variable importance once per model. (Previously STFull and STPopless
# were each computed twice; the redundant duplicate calls were removed.)
STFull    <- RankImp(STPermFullModel)
STPopless <- RankImp(STPermPoplessModel)
STSF      <- RankImp(STPermSFModel)
Full      <- RankImp(PermFullModel)


#This is the important chunk of the code:
# join the rank orders of the two splits (space/time vs temporal) by variable.
VariRank <- STFull %>%
  select(c(Model, STFull = Rank)) %>%
  dplyr::left_join(Full %>% select(c(Model, TFull = Rank)), by = "Model")